package compiler.lexer;

import compiler.constant.LexerConstant;
import compiler.enums.Token;

/**
 * Created by szj on 2017/5/7.
 */
/**
 * Character-level lexer for regular expressions.
 *
 * <p>The expressions are pulled one at a time from a {@link RegularExpressionHandler}. Every call
 * to {@link #advance()} consumes one logical character of the current expression (a plain
 * character, a quoted character or a complete escape sequence) and classifies it as a
 * {@link Token}. The numeric value of the character is available through {@link #getLexeme()}.
 *
 * <p>Instances are stateful and perform no synchronization.
 */
public class Lexer {

    // Integer token codes. Nothing inside this class reads them; they stay because they are
    // part of the public surface of the class.
    public static final int EOI = 0;
    public static final int SEMI = 1;
    public static final int PLUS = 2;
    public static final int TIMES = 3;
    public static final int LP = 4;
    public static final int RP = 5;
    public static final int NUM_OR_ID = 6;
    public static final int WHITE_SPACE = 7;
    public static final int UNKNOWN_SYMBOL = 8;

    /** Maximum number of digits consumed by a numeric escape (\DDD or \xDDD). */
    private static final int MAX_ESCAPE_DIGITS = 3;

    /** Lookup table from character code to token, sized by LexerConstant.ASCII_COUNT. */
    private final Token[] tokenMap = new Token[LexerConstant.ASCII_COUNT];
    private final RegularExpressionHandler exprHandler;
    private Token currentToken = Token.EOS;
    /** Index of the next expression to request from the handler. */
    private int exprCount = 0;
    /** Index of the next unread character in {@link #curExpr}. */
    private int charIndex = 0;
    private String curExpr = "";
    /** True while the lexer is between a pair of double quotes. */
    private boolean inQuoted = false;
    /** Numeric value of the most recently recognised character. */
    private int lexeme;

    /**
     * Creates a lexer that reads its expressions from the given handler.
     *
     * @param exprHandler source of the regular expressions to tokenize
     */
    public Lexer(RegularExpressionHandler exprHandler) {
        initTokenMap();
        this.exprHandler = exprHandler;
    }

    /** Fills the lookup table: every character is a literal unless it is a metacharacter. */
    private void initTokenMap() {
        for (int i = 0; i < tokenMap.length; i++) {
            tokenMap[i] = Token.L;
        }
        tokenMap['.'] = Token.ANY;
        tokenMap['^'] = Token.AT_BOL;
        tokenMap['$'] = Token.AT_EOL;
        tokenMap['['] = Token.CCL_START;
        tokenMap[']'] = Token.CCL_END;
        tokenMap['{'] = Token.OPEN_CURLY;
        tokenMap['}'] = Token.CLOSE_CURLY;
        tokenMap['('] = Token.OPEN_PAREN;
        tokenMap[')'] = Token.CLOSE_PAREN;
        tokenMap['*'] = Token.CLOSURE;
        tokenMap['-'] = Token.DASH;
        tokenMap['?'] = Token.OPTIONAL;
        tokenMap['|'] = Token.OR;
        tokenMap['+'] = Token.PLUS_CLOSE;
    }

    /**
     * Tests the current token.
     *
     * @param t token to compare against
     * @return true when the current token is {@code t}
     */
    public boolean MatchToken(Token t) {
        return currentToken == t;
    }

    /** @return numeric value of the most recently recognised character */
    public int getLexeme() {
        return lexeme;
    }

    /** @return the expression currently being tokenized */
    public String getCurExpr() {
        return curExpr;
    }

    /** @return the token produced by the last call to {@link #advance()} */
    public Token getCurrentToken() {
        return currentToken;
    }

    /**
     * Reads the next token.
     *
     * <p>Returns {@code Token.EOS} when the current expression is exhausted and
     * {@code Token.END_OF_INPUT} when the handler has no further expressions. Inside double
     * quotes every character is a literal; outside quotes a backslash starts an escape sequence
     * that is decoded into a single literal. The sequence {@code \"} always yields a literal
     * double quote.
     *
     * @return the new current token
     */
    public Token advance() {
        // END_OF_INPUT is handled like EOS: otherwise a further call would silently re-lex the
        // last expression from its first character.
        if (currentToken == Token.EOS || currentToken == Token.END_OF_INPUT) {
            if (exprCount == exprHandler.getRegularExpressionCount()) {
                // Every regular expression has been processed.
                currentToken = Token.END_OF_INPUT;
                return currentToken;
            }

            // The previous expression is finished; load the next one.
            curExpr = exprHandler.getRegularExpression(exprCount);
            exprCount++;
            System.out.println("当前正则解析的正则表达式: " + curExpr);
        }

        if (charIndex >= curExpr.length()) {
            return endOfExpression();
        }

        if (curExpr.charAt(charIndex) == '"') {
            inQuoted = !inQuoted;
            charIndex++;
            // The quote may have been the last character of the expression.
            if (charIndex >= curExpr.length()) {
                return endOfExpression();
            }
        }

        boolean sawEscape = curExpr.charAt(charIndex) == '\\';
        if (sawEscape) {
            boolean quoteFollows = charIndex + 1 < curExpr.length()
                    && curExpr.charAt(charIndex + 1) == '"';

            if (quoteFollows) {
                // \" is a literal double quote, inside or outside quotes.
                lexeme = '"';
                charIndex += 2;
                currentToken = Token.L;
                return currentToken;
            }

            if (!inQuoted) {
                // Escape sequence outside of quotes.
                lexeme = handleEscape();
                currentToken = Token.L;
                return currentToken;
            }
            // Inside quotes a backslash not followed by a quote is an ordinary character.
        }

        lexeme = curExpr.charAt(charIndex++);

        // Characters beyond the lookup table cannot be metacharacters, so they are literals.
        currentToken = (inQuoted || lexeme >= tokenMap.length)
                ? Token.L
                : tokenMap[lexeme];

        printLexResult();

        return currentToken;
    }

    /** Marks the current expression as finished and rewinds the read position. */
    private Token endOfExpression() {
        currentToken = Token.EOS;
        charIndex = 0;
        return currentToken;
    }

    /** Prints the recognised character and whether it is a literal or a metacharacter. */
    private void printLexResult() {
        System.out.println("当前识别字符是: " + (char) getLexeme());

        if (!MatchToken(Token.L)) {
            System.out.println("当前字符具有特殊含义");
            printMetaCharMeaning();
            return;
        }

        System.out.println("当前字符是普通字符常量");
    }

    /** Prints a description of the current metacharacter token (an empty line if none matches). */
    private void printMetaCharMeaning() {
        String s = "";
        if (MatchToken(Token.ANY)) {
            s = "当前字符是点通配符";
        } else if (MatchToken(Token.AT_BOL)) {
            s = "当前字符是开头匹配符";
        } else if (MatchToken(Token.AT_EOL)) {
            s = "当前字符是末尾匹配符";
        } else if (MatchToken(Token.CCL_END)) {
            s = "当前字符是字符集类结尾括号";
        } else if (MatchToken(Token.CCL_START)) {
            s = "当前字符是字符集类的开始括号";
        } else if (MatchToken(Token.CLOSE_CURLY)) {
            s = "当前字符是结尾大括号";
        } else if (MatchToken(Token.CLOSE_PAREN)) {
            s = "当前字符是结尾圆括号";
        } else if (MatchToken(Token.DASH)) {
            s = "当前字符是横杆";
        } else if (MatchToken(Token.OPEN_CURLY)) {
            s = "当前字符是起始大括号";
        } else if (MatchToken(Token.OPEN_PAREN)) {
            s = "当前字符是起始圆括号";
        } else if (MatchToken(Token.OPTIONAL)) {
            s = "当前字符是单字符匹配符?";
        } else if (MatchToken(Token.OR)) {
            s = "当前字符是或操作符";
        } else if (MatchToken(Token.PLUS_CLOSE)) {
            s = "当前字符是正闭包操作符";
        } else if (MatchToken(Token.CLOSURE)) {
            s = "当前字符是闭包操作符";
        }

        System.out.println(s);
    }

    /**
     * Decodes the escape sequence starting at the backslash under {@link #charIndex} and leaves
     * {@link #charIndex} on the first character after the sequence.
     *
     * <p>Recognised forms (the letter after the backslash is case-insensitive):
     * <pre>
     * \b    backspace
     * \f    form feed
     * \n    newline
     * \r    carriage return
     * \s    space
     * \t    tab
     * \e    ASCII ESC ('\033')
     * \DDD  up to three octal digits
     * \xDDD up to three hexadecimal digits
     * \^C   control character, e.g. \^A is 1
     * </pre>
     * Any other escaped character stands for itself. A backslash that ends the expression is a
     * literal backslash, and {@code \^} at the end of the expression is a literal {@code ^}.
     *
     * @return the numeric value of the decoded character
     */
    private int handleEscape() {
        charIndex++; // step over the backslash

        if (charIndex >= curExpr.length()) {
            // Nothing follows the backslash, so it stands for itself.
            return '\\';
        }

        int rval = 0;
        char escaped = curExpr.charAt(charIndex);
        // Fold a single character instead of upper-casing the whole string: String.toUpperCase
        // is locale-sensitive and may change the length, which would misalign charIndex.
        switch (toAsciiUpper(escaped)) {
            case 'B':
                rval = '\b';
                charIndex++;
                break;
            case 'F':
                rval = '\f';
                charIndex++;
                break;
            case 'N':
                rval = '\n';
                charIndex++;
                break;
            case 'R':
                rval = '\r';
                charIndex++;
                break;
            case 'S':
                rval = ' ';
                charIndex++;
                break;
            case 'T':
                rval = '\t';
                charIndex++;
                break;
            case 'E':
                rval = '\033';
                charIndex++;
                break;
            case '^':
                charIndex++; // step over '^'
                if (charIndex >= curExpr.length()) {
                    rval = '^';
                    break;
                }
                // Control characters sit 64 below their letter in the ASCII table:
                // '@' is 64 and 'A' is 65, so 'A' - '@' == 1 is the code of ^A.
                rval = (char) (toAsciiUpper(curExpr.charAt(charIndex)) - '@');
                charIndex++;
                break;
            case 'X':
                charIndex++; // step over the X
                rval = readEscapedNumber(true);
                break;
            default:
                if (isOctDigit(escaped)) {
                    // charIndex is already on the first octal digit; do not skip it.
                    rval = readEscapedNumber(false);
                } else {
                    rval = escaped;
                    charIndex++;
                }
                break;
        }

        return rval;
    }

    /**
     * Reads up to {@link #MAX_ESCAPE_DIGITS} digits starting at {@link #charIndex}, stopping at
     * the first non-digit or at the end of the expression.
     *
     * @param hex true to read hexadecimal digits, false to read octal digits
     * @return the accumulated value, 0 when no digit is present
     */
    private int readEscapedNumber(boolean hex) {
        int value = 0;
        for (int digits = 0; digits < MAX_ESCAPE_DIGITS && charIndex < curExpr.length(); digits++) {
            char c = curExpr.charAt(charIndex);
            if (hex ? !isHexDigit(c) : !isOctDigit(c)) {
                break;
            }
            value = hex
                    ? (value << 4) | hex2Bin(c)
                    : (value << 3) | oct2Bin(c);
            charIndex++;
        }
        return value;
    }

    /** Upper-cases ASCII letters only; every other character is returned unchanged. */
    private char toAsciiUpper(char c) {
        return ('a' <= c && c <= 'z') ? (char) (c - 'a' + 'A') : c;
    }

    /**
     * Converts a hexadecimal digit character to its value, e.g. 'A' to 10 and 'b' to 11.
     *
     * @param c one of 0123456789abcdefABCDEF
     */
    private int hex2Bin(char c) {
        return (('0' <= c && c <= '9')
                ? c - '0'
                : (toAsciiUpper(c) - 'A' + 10)) & 0xf;
    }

    /**
     * Converts an octal digit character to its value.
     *
     * @param c one of 01234567
     */
    private int oct2Bin(char c) {
        return ((c) - '0') & 0x7;
    }

    /** @return true for the ASCII characters 0-9, a-f and A-F */
    private boolean isHexDigit(char c) {
        return ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F');
    }

    /** @return true for the ASCII characters 0-7 */
    private boolean isOctDigit(char c) {
        return ('0' <= c && c <= '7');
    }
}
